SBA loan data analysis

1. Setting up and loading/cleaning data

1.1 Load libraries, mappings, and data

library(tidyverse)
library(geomtextpath)
library(plotly)
library(htmlwidgets)
library(maps)
library(dplyr)
df <- read.csv("SBAnational.csv")
naics_mapping <- list(
  "11" = "Agriculture, Forestry, Fishing and Hunting",
  "21" = "Mining, Quarrying, and Oil and Gas Extraction",
  "22" = "Utilities",
  "23" = "Construction",
  "31" = "Manufacturing",
  "32" = "Manufacturing",
  "33" = "Manufacturing",
  "42" = "Wholesale Trade",
  "44" = "Retail Trade",
  "45" = "Retail Trade",
  "48" = "Transportation and Warehousing",
  "49" = "Transportation and Warehousing",
  "51" = "Information",
  "52" = "Finance and Insurance",
  "53" = "Real Estate and Rental and Leasing",
  "54" = "Professional, Scientific, and Technical Services",
  "55" = "Management of Companies and Enterprises",
  "56" = "Administrative and Support Services",
  "61" = "Educational Services",
  "62" = "Health Care and Social Assistance",
  "71" = "Arts, Entertainment, and Recreation",
  "72" = "Accommodation and Food Services",
  "81" = "Other Services",
  "92" = "Public Administration"
)

1.2 Filter out missing variables and limit data range to 2000-2010

df <- df %>%
  filter(df$MIS_Status != "", df$GrAppv != "", df$SBA_Appv != "",
         df$NAICS != 0, df$State != "", df$NoEmp <= 1500,
         df$LowDoc == "Y" | df$LowDoc == "N", df$Term > 0,
         df$ApprovalFY < 2011 & df$ApprovalFY > 1999, df$NewExist != 0,
         df$RevLineCr == "Y" | df$RevLineCr == "N", df$UrbanRural > 0)

df$GrAppv <- as.numeric(gsub("[$,]", "", df$GrAppv))
df$SBA_Appv <- as.numeric(gsub("[$,]", "", df$SBA_Appv))
df$DisbursementGross <- as.numeric(gsub("[$,]", "", df$DisbursementGross))
df$ApprovalFY <- as.numeric(df$ApprovalFY)

1.3 Convert variables as needed and select desired variables

df <- df %>%
  mutate(RealEstate = ifelse(df$Term >= 240, 1, 0)) %>%
  mutate(MIS_Status = ifelse(df$MIS_Status == "CHGOFF", 1, 0)) %>%
  mutate(FranchiseCode = ifelse(df$FranchiseCode > 1, 1, 0)) %>%
  mutate(UrbanRural = ifelse(df$UrbanRural == 1, 1, 0)) %>%
  mutate(NewExist = ifelse(df$NewExist == 1, 1, 0)) %>%
  mutate(RevLineCr = ifelse(df$RevLineCr == "Y", 1, 0)) %>%
  mutate(LowDoc = ifelse(df$LowDoc == "Y", 1, 0)) %>%
  mutate(NAICS = as.integer(NAICS / 10000)) %>%
  mutate(Industry = as.character(naics_mapping[as.character(NAICS)])) %>%
  mutate(Proportion = SBA_Appv / GrAppv) %>%
  mutate(Recession = ifelse(df$ApprovalFY > 2007 & df$ApprovalFY <= 2009,
                            1, 0))
df <- df %>%
  select(State, Industry, FranchiseCode, UrbanRural,
         Term, NoEmp, NewExist, RevLineCr, Recession,
         RealEstate, DisbursementGross, LowDoc, NoEmp, Proportion, MIS_Status)

1.4 Recession anaylsis

Determine whether or not data that occurred in the recession is skewed enough to warrant excluding it from the data set. Consider p-value < 0.05 significant

recession <- df %>%
  select(MIS_Status, Recession)
chisq.test(table(recession))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(recession)
## X-squared = 2055.9, df = 1, p-value < 2.2e-16
recession_summary <- recession %>%
  group_by(Recession, MIS_Status) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(Recession) %>%
  mutate(percentage = count / sum(count)) %>%
  ungroup()

recession_plot <- ggplotly(
  ggplot(recession_summary, aes(
    x = factor(Recession),
    y = percentage,
    fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
    text = paste(
      "Loan Occured during the Recession: ", ifelse(Recession == 1, "Yes",
                                                    "No"),
      "<br>Default Status: ", ifelse(MIS_Status == 1, "Defaulted",
                                     "Paid In Full"),
      "<br>Percentage: ", scales::percent(percentage, accuracy = 0.01)
    )
  )) +
    geom_bar(stat = "identity", position = "fill", alpha = 0.75) +
    scale_y_continuous(labels = scales::percent) +
    scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
    labs(
      title = "Loan Occured during the Recession vs Default Rate",
      x = "Loan Occured during the Recession",
      y = "Percentage",
      fill = "Default Status"
    ),
  tooltip = "text"
)

recession_plot

With a p-value <0.05 and there being a significant difference in default rate between loans that occurred during the Great Recession and loans that didn’t occur during the Great Recession we will exclude the data from the Great Recession

df <- df %>%
  filter(df$Recession != 1)
df <- df %>% select(-Recession)

2. Statistcal Analysis

2.1 State

state_mapping <- setNames(state.name, state.abb)
state_test <- df %>%
  select(State, MIS_Status)
chisq.test(table(state_test))
## 
##  Pearson's Chi-squared test
## 
## data:  table(state_test)
## X-squared = 5696.8, df = 50, p-value < 2.2e-16
state <- df %>%
  mutate(StateName = tolower(state_mapping[State])) %>%
  group_by(StateName) %>%
  summarise(default_rate = mean(MIS_Status == 1))
map <- map_data("state")
map <- left_join(map, state, by = c("region" = "StateName"))
state_plot <- ggplotly(
  ggplot(map, aes(x = long, y = lat, group = group, fill = default_rate,
    text = paste(
      "State: ", tools::toTitleCase(region),
      "<br>Default Rate: ", scales::percent(default_rate, accuracy = 0.01)
    )
  )) +
    geom_polygon(color = "white") +
    scale_fill_gradient(low = "lightblue", high = "red",
                        labels = scales::percent) +
    labs(
      title = "Loan Default Rate by State",
      fill = "Default Rate (%)"
    ),
  tooltip = "text"
)
state_plot

2.2 Industry

industry <- df %>%
  select(MIS_Status, Industry)
chisq.test(table(industry))
## 
##  Pearson's Chi-squared test
## 
## data:  table(industry)
## X-squared = 3347.4, df = 19, p-value < 2.2e-16

Create two summaries, one that includes both default and paid in full rates for graphing, one that includes only default rate for later use.

industry_summary <- industry %>%
  group_by(Industry, MIS_Status) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(Industry) %>%
  mutate(percentage = count / sum(count)) %>%
  ungroup()

industry_summary2 <- industry %>%
  group_by(Industry) %>%
  summarise(default_rate = mean(MIS_Status == 1))
industry_plot <- ggplotly(
  ggplot(industry_summary, aes(
    x = factor(Industry),
    y = percentage,
    fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
    text = paste(
      "Industry: ", Industry,
      "<br>Default Status: ", ifelse(MIS_Status == 1, "Defaulted",
                                     "Paid In Full"),
      "<br>Percentage: ", scales::percent(percentage, accuracy = 0.01)
    )
  )) +
    geom_bar(stat = "identity", position = "fill", alpha = 0.75) +
    scale_y_continuous(labels = scales::percent) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(
      title = "Industry vs Default Rate",
      x = "Industry",
      y = "Percentage",
      fill = "Default Status"
    ),
  tooltip = "text"
)
industry_plot

2.3 Real Estate

real_estate <- df %>%
  select(MIS_Status, RealEstate)
chisq.test(table(real_estate))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(real_estate)
## X-squared = 6830.5, df = 1, p-value < 2.2e-16
real_estate_summary <- real_estate %>%
  group_by(RealEstate, MIS_Status) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(RealEstate) %>%
  mutate(percentage = count / sum(count)) %>%
  ungroup()

real_estate_plot <- ggplotly(
  ggplot(real_estate_summary, aes(
    x = factor(RealEstate),
    y = percentage,
    fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
    text = paste(
      "Backed by Real Estate: ", ifelse(RealEstate == 1, "Yes", "No"),
      "<br>Default Status: ", ifelse(MIS_Status == 1, "Defaulted",
                                     "Paid In Full"),
      "<br>Percentage: ", scales::percent(percentage, accuracy = 0.01)
    )
  )) +
    geom_bar(stat = "identity", position = "fill", alpha = 0.75) +
    scale_y_continuous(labels = scales::percent) +
    scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
    labs(
      title = "Real Estate vs Default Rate",
      x = "Backed By Real Estate",
      y = "Percentage",
      fill = "Default Status"
    ),
  tooltip = "text"
)

real_estate_plot

2.4 Franchise

franchise <- df %>%
  select(MIS_Status, FranchiseCode)
chisq.test(table(franchise))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(franchise)
## X-squared = 257.88, df = 1, p-value < 2.2e-16
franchise_summary <- franchise %>%
  group_by(FranchiseCode, MIS_Status) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(FranchiseCode) %>%
  mutate(percentage = count / sum(count)) %>%
  ungroup()

franchise_plot <- ggplotly(
  ggplot(franchise_summary, aes(
    x = factor(FranchiseCode),
    y = percentage,
    fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
    text = paste(
      "Is Franchise: ", ifelse(FranchiseCode == 1, "Yes", "No"),
      "<br>Default Status: ", ifelse(MIS_Status == 1, "Defaulted",
                                     "Paid In Full"),
      "<br>Percentage: ", scales::percent(percentage, accuracy = 0.01)
    )
  )) +
    geom_bar(stat = "identity", position = "fill", alpha = 0.75) +
    scale_y_continuous(labels = scales::percent) +
    scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
    labs(
      title = "Is Franchise vs Default Rate",
      x = "Is Franchise",
      y = "Percentage",
      fill = "Default Status"
    ),
  tooltip = "text"
)

franchise_plot

2.5 New vs Existing businesses

new <- df %>%
  select(MIS_Status, NewExist)
chisq.test(table(new))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(new)
## X-squared = 27.349, df = 1, p-value = 1.698e-07
new_summary <- new %>%
  group_by(NewExist, MIS_Status) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(NewExist) %>%
  mutate(percentage = count / sum(count)) %>%
  ungroup()

new_plot <- ggplotly(
  ggplot(new_summary, aes(
    x = factor(NewExist),
    y = percentage,
    fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
    text = paste(
      "Existed for more than 2 years: ", ifelse(NewExist == 1, "Yes", "No"),
      "<br>Default Status: ", ifelse(MIS_Status == 1, "Defaulted",
                                     "Paid In Full"),
      "<br>Percentage: ", scales::percent(percentage, accuracy = 0.01)
    )
  )) +
    geom_bar(stat = "identity", position = "fill", alpha = 0.75) +
    scale_y_continuous(labels = scales::percent) +
    scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
    labs(
      title = "Is Existing vs Default Rate",
      x = "Is Existing",
      y = "Percentage",
      fill = "Default Status"
    ),
  tooltip = "text"
)

new_plot

2.6 Revolving Line of Credit

credit <- df %>%
  select(MIS_Status, RevLineCr)
chisq.test(table(credit))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(credit)
## X-squared = 68.8, df = 1, p-value < 2.2e-16
credit_summary <- credit %>%
  group_by(RevLineCr, MIS_Status) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(RevLineCr) %>%
  mutate(percentage = count / sum(count)) %>%
  ungroup()

credit_plot <- ggplotly(
  ggplot(credit_summary, aes(
    x = factor(RevLineCr),
    y = percentage,
    fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
    text = paste(
      "Has Revolving Line of Credit: ", ifelse(RevLineCr == 1, "Yes", "No"),
      "<br>Default Status: ", ifelse(MIS_Status == 1, "Defaulted",
                                     "Paid In Full"),
      "<br>Percentage: ", scales::percent(percentage, accuracy = 0.01)
    )
  )) +
    geom_bar(stat = "identity", position = "fill", alpha = 0.75) +
    scale_y_continuous(labels = scales::percent) +
    scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
    labs(
      title = "Has Revolving Line of Credit vs Default Rate",
      x = "Has Revolving Line of Credit",
      y = "Percentage",
      fill = "Default Status"
    ),
  tooltip = "text"
)

credit_plot

2.7 Urban vs Rural

urban <- df %>%
  select(MIS_Status, UrbanRural)
chisq.test(table(urban))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(urban)
## X-squared = 771.6, df = 1, p-value < 2.2e-16
urban_summary <- urban %>%
  group_by(UrbanRural, MIS_Status) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(UrbanRural) %>%
  mutate(percentage = count / sum(count)) %>%
  ungroup()

urban_plot <- ggplotly(
  ggplot(urban_summary, aes(
    x = factor(UrbanRural),
    y = percentage,
    fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
    text = paste(
      "Urban or Rural: ", ifelse(UrbanRural == 1, "Urban", "Rural"),
      "<br>Default Status: ", ifelse(MIS_Status == 1, "Defaulted",
                                     "Paid In Full"),
      "<br>Percentage: ", scales::percent(percentage, accuracy = 0.01)
    )
  )) +
    geom_bar(stat = "identity", position = "fill", alpha = 0.75) +
    scale_y_continuous(labels = scales::percent) +
    scale_x_discrete(labels = c("0" = "Rural", "1" = "Urban")) +
    labs(
      title = "Ubran/Rural vs Default Rate",
      x = "Ubran/Rural",
      y = "Percentage",
      fill = "Default Status"
    ),
  tooltip = "text"
)

urban_plot

2.8 LowDoc loan program

lowdoc <- df %>%
  select(MIS_Status, LowDoc)
chisq.test(table(lowdoc))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(lowdoc)
## X-squared = 59.741, df = 1, p-value = 1.082e-14
lowdoc_summary <- lowdoc %>%
  group_by(LowDoc, MIS_Status) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(LowDoc) %>%
  mutate(percentage = count / sum(count)) %>%
  ungroup()

lowdoc_plot <- ggplotly(
  ggplot(lowdoc_summary, aes(
    x = factor(LowDoc),
    y = percentage,
    fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
    text = paste(
      "In LowDoc program: ", ifelse(LowDoc == 1, "Yes", "No"),
      "<br>Default Status: ", ifelse(MIS_Status == 1, "Defaulted",
                                     "Paid In Full"),
      "<br>Percentage: ", scales::percent(percentage, accuracy = 0.01)
    )
  )) +
    geom_bar(stat = "identity", position = "fill", alpha = 0.75) +
    scale_y_continuous(labels = scales::percent) +
    scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
    labs(
      title = "In LowDoc Program vs Default Rate",
      x = "In LowDoc Programt",
      y = "Percentage",
      fill = "Default Status"
    ),
  tooltip = "text"
)

lowdoc_plot

2.9 SBA gauranteed proportion

sba_prop <- df %>%
  select(MIS_Status, Proportion)
t.test(Proportion ~ MIS_Status, data = sba_prop, alternative = "two.sided")
## 
##  Welch Two Sample t-test
## 
## data:  Proportion by MIS_Status
## t = 53.883, df = 163065, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.03112824 0.03347829
## sample estimates:
## mean in group 0 mean in group 1 
##       0.5972755       0.5649723
sba_prop_plot <- ggplotly(
  ggplot(sba_prop, aes(
    x = Proportion,
    fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
                   text = paste(
                     "Term: ", after_stat(xmin), " to ", after_stat(xmax),
                     "<br>Count: ", after_stat(count)
                   ))) +
    geom_histogram(
                   position = "dodge",
                   binwidth = 0.25,
                   boundary = 0,
                   alpha = 0.7) +
    xlim(0, max(sba_prop$Proportion)) +
    facet_wrap(~ MIS_Status,
               scales = "free_y",
               labeller = as_labeller(c("0" = "Paid In Full",
                                        "1" = "Defaulted"))) +
    labs(
      title = "Histogram of loan portion gauranteed by MIS Status",
      x = "Porportion of loan gauranteed by SBA",
      y = "Count",
      fill = "Default Status"
    ),
  tooltip = "text"
)
sba_prop_plot

2.10 Gross Disbursement

Find quartile for data to make a barchart that displays the distribution of gross disbursement by quartile

disbur <- df %>%
  mutate(
    Quartile = cut(
      DisbursementGross,
      breaks = quantile(DisbursementGross, probs = seq(0, 1, by = 0.25)),
      include.lowest = TRUE,
      labels = c("0-25%", "25-50%", "50-75%", "75-100%")
    )
  ) %>%
  select(MIS_Status, DisbursementGross, Quartile)
t.test(DisbursementGross ~ MIS_Status, data = disbur,
       alternative = "two.sided")
## 
##  Welch Two Sample t-test
## 
## data:  DisbursementGross by MIS_Status
## t = 77.279, df = 215958, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  64680.67 68046.94
## sample estimates:
## mean in group 0 mean in group 1 
##        168989.3        102625.5
disbur_prop <- disbur %>%
  group_by(MIS_Status, Quartile) %>%
  summarise(Count = n()) %>%
  group_by(MIS_Status) %>%
  mutate(Proportion = Count / sum(Count))
## `summarise()` has grouped output by 'MIS_Status'. You can override using the
## `.groups` argument.
disbur_plot <- ggplotly(
  ggplot(disbur_prop, aes(x = Quartile, y = Proportion, fill = Quartile,
   text = paste(
    "Quartile: ", Quartile,
    "<br>Percentage: ", scales::percent(Proportion, accuracy = 0.01)
  ))) +
    geom_bar(stat = "identity", alpha = 0.75) +
    facet_wrap(~ MIS_Status, labeller =
                 as_labeller(c(`0` = "Paid In Full", `1` = "Default"))) +
    scale_y_continuous(labels = scales::percent) +
    labs(
      title = "Gross Disbursement Distribution by Quartiles",
      x = "Gross Disbursement Quartile",
      y = "Proportion",
      fill = "Quartile"
    ) +
    theme_minimal() +
    theme(
      legend.position = "bottom",
      strip.text = element_text(face = "bold"),
      plot.title = element_text(hjust = 0.5)
    ),
  tooltip = "text"
)
disbur_plot

2.11 Number of employees

Do same process as done with the gross disbursement

emp <- df %>%
  mutate(
    Quartile = cut(
      NoEmp,
      breaks = quantile(NoEmp, probs = seq(0, 1, by = 0.25)),
      include.lowest = TRUE,
      labels = c("0-25%", "25-50%", "50-75%", "75-100%")
    )
  ) %>%
  select(MIS_Status, NoEmp, Quartile)
t.test(NoEmp ~ MIS_Status, data = emp,
       alternative = "two.sided")
## 
##  Welch Two Sample t-test
## 
## data:  NoEmp by MIS_Status
## t = 51.518, df = 236308, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  2.899522 3.128870
## sample estimates:
## mean in group 0 mean in group 1 
##        8.684455        5.670259
emp_prop <- emp %>%
  group_by(MIS_Status, Quartile) %>%
  summarise(Count = n()) %>%
  group_by(MIS_Status) %>%
  mutate(Proportion = Count / sum(Count))
## `summarise()` has grouped output by 'MIS_Status'. You can override using the
## `.groups` argument.
emp_plot <- ggplotly(
  ggplot(emp_prop, aes(x = Quartile, y = Proportion, fill = Quartile,
  text = paste (
    "Quartile: ", Quartile,
    "<br>Percentage: ", scales::percent(Proportion, accuracy = 0.01)
  ))) +
    geom_bar(stat = "identity", alpha = 0.75) +
    facet_wrap(~ MIS_Status, labeller =
                 as_labeller(c(`0` = "Paid In Full", `1` = "Default"))) +
    scale_y_continuous(labels = scales::percent) +
    labs(
      title = "Number of employee Distribution by Quartiles",
      x = "Disbursement Quartile",
      y = "Proportion",
      fill = "Quartile"
    ) +
    theme_minimal() +
    theme(
      legend.position = "bottom",
      strip.text = element_text(face = "bold"),
      plot.title = element_text(hjust = 0.5)
    ),
  tooltip = "text"
)
emp_plot

2.13 Term Length

term <- df %>%
  select(MIS_Status, Term)
t.test(Term ~ MIS_Status, data = term, alternative = "two.sided")
## 
##  Welch Two Sample t-test
## 
## data:  Term by MIS_Status
## t = 272.38, df = 233311, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  48.46045 49.16292
## sample estimates:
## mean in group 0 mean in group 1 
##        94.92662        46.11493

Filter few data points to make graph more balanced

term <- term %>%
  filter(Term <= 300)
term_plot <- ggplotly(
  ggplot(term, aes(
    x = Term,
    fill = factor(MIS_Status, labels = c("Paid In Full", "Defaulted")),
                   text = paste(
                     "Term: ", after_stat(xmin), " to ", after_stat(xmax),
                     "<br>Count: ", after_stat(count)
                   ))) +
    geom_histogram(
                   position = "dodge",
                   binwidth = 25,
                   boundary = 0,
                   alpha = 0.7) +
    xlim(0, max(term$Term)) +
    facet_wrap(~ MIS_Status,
               scales = "free_y",
               labeller = as_labeller(c("0" = "Paid In Full",
                                        "1" = "Defaulted"))) +
    labs(
      title = "Histogram of Term by MIS Status",
      x = "Term",
      y = "Count",
      fill = "Default Status"
    ),
  tooltip = "text"
)
term_plot